Workflow

Assembly genome with MaSurCa

Make config file

# MaSuRCA assembler configuration file (DATA + PARAMETERS sections).
# quick run configuration file
DATA
# Paired-end library: prefix "pe", mean insert 350 bp, stdev 50 bp, then R1/R2 files.
PE = pe 350 50 Mp_R1.fastq.gz Mp_R2.fastq.gz
END
PARAMETERS
# No jump (mate-pair) libraries are used, so do not extend jump reads.
EXTEND_JUMP_READS=0
# Let MaSuRCA choose the k-mer size for the assembly graph automatically.
GRAPH_KMER_SIZE = auto
# Use paired-end linking information during scaffolding.
USE_LINKING_MATES = 1
# Run locally; the grid engine/queue settings below are ignored when USE_GRID=0.
USE_GRID=0
GRID_ENGINE=SLURM
GRID_QUEUE=m3tb
# Coverage of longest high-error reads to use (only relevant for long-read input).
LHE_COVERAGE=25
# Celera Assembler options; cgwErrorRate relaxed to 0.15.
CA_PARAMETERS =  cgwErrorRate=0.15
# Run the gap-closing step on the final scaffolds.
CLOSE_GAPS=1
NUM_THREADS = 48
# Jellyfish hash size; NOTE(review): should be roughly genome_size * coverage -- confirm.
JF_SIZE = 250000000000
# Use the SOAPdenovo contigging path (Illumina-only data) instead of CABOG.
SOAP_ASSEMBLY=1
# Do not use Flye for contigging.
FLYE_ASSEMBLY=0
END

Run MaSurca

# Load the MaSuRCA assembler.
module load masurca/4.0.5

# The job command(s)

# Generate the assemble.sh driver script from the configuration file above.
masurca config.txt
# Run the generated assembly pipeline.
./assemble.sh

Total scaffolds: 4713

Mapping raw reads

# Modules required for mapping the raw reads back to the assembly.
# (These were listed as bare names; as shell commands they must be loaded.)
module load bowtie/2.3.4
module load samtools/1.9.0
module load bbtools/38.37

#Index bowtie2
# Build the bowtie2 index ("masurca" prefix) for the MaSuRCA scaffolds.
bowtie2-build --threads 16 scaffolds.fasta masurca

#Run mapping
# --no-unal drops unaligned reads from the SAM; log goes to bowtie.log.
bowtie2 --no-unal -p 16 -x masurca -1 Mp_R1.fastq.gz -2 Mp_R2.fastq.gz -S masurca.sam &>bowtie.log

#index scaffolds (produces scaffolds.fasta.fai)
samtools faidx scaffolds.fasta

#Convert sam to bam and coordinate-sort in one pipeline.
# NOTE(review): `samtools view -t` formally expects the .fai index file
# (scaffolds.fasta.fai); this works when the SAM carries @SQ headers -- confirm.
samtools view -@ 16 -bt scaffolds.fasta masurca.sam | samtools sort -@ 16 -o masurca.bam

#coverage info per scaffold (BBTools pileup.sh)
# Fixed typo: input was "bmasurca.bam" but the sort step above writes "masurca.bam".
pileup.sh -Xmx30g in=masurca.bam ref=scaffolds.fasta out=mapping_stats_masurca.txt

Cleaning genome

Cleaning with taxon classification using Blobtools v1

The filter steps were adapted from Iha et al. 2021

Make custom database

I used non-redundant (NR) database from GenBank.

Download prot.accession2taxid.gz

Get taxonID for the selected lineages:

  • 2 -> bacteria

  • 2157 -> archaea

  • 4751 -> fungi

  • 10239 -> virus

  • 33634 -> Stramenopiles

    • 2696291 -> Ochrophyta
    • 2870 -> Phaeophyceae
    • 2836 -> Bacillariophyta
  • 2763 -> Rhodophyta

  • 3041 -> Chlorophyta

#Using TaxonKit (https://bioinf.shenwei.me/taxonkit/)

# Create a list of taxids
# taxonkit list expands every root taxid above into all of its descendant
# taxids, one per line; --indent "" disables the tree indentation.
taxonkit list --ids 2,2157,4751,10239,33634,2763,3041 --indent "" > lineages.taxid.txt

# Total taxIDs:
wc -l lineages.taxid.txt

# Retrieving target accessions
# Stream-decompress the NCBI accession->taxid map, keep rows whose "taxid"
# column matches the lineage list, keep accession.version + taxid columns,
# and drop the header line (sed 1d).
pigz -dc prot.accession2taxid.gz | csvtk grep -t -f taxid -P lineages.taxid.txt | csvtk cut -t -f accession.version,taxid | sed 1d > lineages.acc2taxid.txt

# Accession list only (column 1), for batch retrieval from the BLAST db.
cut -f 1 lineages.acc2taxid.txt > lineages.acc.txt

# Target retrieved:
wc -l lineages.acc.txt

Split lineages.acc.txt in many files

Retrieving FASTA from NR - using array in SLURM

#SBATCH --output=array_%A-%a.out
# One array task per accession-list chunk (files lineages.acc_0 .. lineages.acc_1744).
#SBATCH --array=0-1744

# Retrieving FASTA

# Pull this chunk's accessions out of the local nr BLAST db and gzip on the fly.
blastdbcmd -db nr -entry_batch lineages.acc_${SLURM_ARRAY_TASK_ID} -out - | pigz -c > nr_${SLURM_ARRAY_TASK_ID}.db.fa.gz

Concatenate all sequences to create a unique database

# Concatenate the per-task chunks into a single database (gzip members concatenate cleanly).
cat nr_*.db.fa.gz > nr.db.fa.gz

# Counting sequences (grep -c replaces the grep | wc -l pipeline):
pigz -dc nr.db.fa.gz | grep -c '>'

Prepare data to Blobtools v1:

# Modules required for the homology searches (were listed as bare names,
# which would fail as commands; load them as elsewhere in this workflow).
module load blast+/2.12.0
module load diamond/2.0.4
module load samtools/1.9.0

#Create database with Uniprot
diamond makedb --in uniprot_ref_proteomes.fasta -d uniprot_ref_proteomes.diamond

#Running diamond to Uniprot
# blastx: DNA scaffolds vs. protein db; tabular (outfmt 6) output for blobtools.
diamond blastx --query scaffolds.fasta --max-target-seqs 10 --sensitive --threads 16 --db uniprot_ref_proteomes.diamond.dmnd --evalue 1e-25 --outfmt 6 --out masurca.vs.uniprot.1e25.diam.out

#Running Blast to Silva
# megablast vs. the SILVA rRNA db; explicit 12-column tabular output.
blastn -task megablast -query scaffolds.fasta -db database/Silva/silva.rDNA.fasta -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 16 -evalue 1e-65 -out masurca.vs.silva.1e65.megablast.out

#Running Blast to Custom database
#Making db to custom database
gunzip database/Custom_db/nr.db.fa.gz
# NOTE(review): gunzip above produces nr.db.fa, but makedb reads nr.clean.db.fa --
# presumably an intermediate cleaning step is not shown here; confirm.
diamond makedb -p 16 --in database/Custom_db/nr.clean.db.fa -d database/Custom_db/nr.db.fa.diamond

#Running Diamond to Custom database
diamond blastx --query scaffolds.fasta --max-target-seqs 10 --sensitive --threads 16 --db database/Custom_db/nr.db.fa.diamond.dmnd --evalue 1e-20 --outfmt 6 --out masurca.vs.customdb.1e20.diamond.out

#taxify for Uniprot
# blobtools taxify attaches a taxid to each hit: -s = hit-seqid column,
# -t = taxid column in the mapping file (0-based).
blobtools taxify -f masurca.vs.uniprot.1e25.diam.out -m database/Uniprot/uniprot_ref_proteomes.taxids -s 0 -t 2

#taxify for Silva
blobtools taxify -f masurca.vs.silva.1e65.megablast.out -m database/Silva/silva_ref_rRNA.taxids -s 0 -t 2

#taxify for Custom database
# The custom mapping file (lineages.acc2taxid.txt) has the taxid in column 1.
blobtools taxify -f masurca.vs.customdb.1e20.diamond.out -m database/Custom_db/lineages.acc2taxid.txt -s 0 -t 1

#Concat all hits
cat masurca.vs.uniprot.1e25.diam.taxified.out masurca.vs.silva.1e65.megablast.taxified.out masurca.vs.customdb.1e20.diamond.taxified.out > hits.tsv

Run Blobtools

# Build the blobDB from assembly + taxified hits + NCBI taxdump + coverage BAM.
blobtools create -i scaffolds.fasta -t hits.tsv --nodes blobtools/nodes/nodes.dmp --names blobtools/nodes/names.dmp -b masurca.bam -o masurca_blobtools

#Running Blobtools view
# Dumps the per-scaffold table (taxonomy, GC, coverage).
blobtools view -i masurca_blobtools.blobDB.json -o masurca_blobview

#Running Blobtool plot
blobtools plot -i  masurca_blobtools.blobDB.json

BlobTools plot

Removing eukaryote scaffolds that are taxonomically distant from Stramenopiles

# Column 6 of the blobtools bestsum table is the assigned taxon;
# list the unique taxa, then hand-edit the file down to the taxa to REMOVE.
cut -f6 masurca_blobview.blobdb.blobDB.bestsum.table.txt | sort -u > 00_taxa_to_remove
#manipulate on vi

Kept only the bold taxa:

Actinobacteria Arthropoda Bacillariophyta Bacteria-undef Bacteroidetes Balneolaeota Basidiomycota Candidatus Tectomicrobia Chloroflexi Chlorophyta Cyanobacteria Eukaryota-undef Evosea Firmicutes Gemmatimonadetes no-hit Oomycota Planctomycetes Proteobacteria Streptophyta undef Verrucomicrobia

# Scaffold IDs (column 1) whose taxon matches the edited removal list.
grep -f 00_taxa_to_remove masurca_blobview.blobdb.blobDB.bestsum.table.txt | cut -f1 > 00_scaffolds_to_remove_byTaxa.ID

19 scaffolds to remove

# Keep every scaffold header NOT in the removal list (-w = whole-word match).
grep '>' scaffolds.fasta | grep -v -w -f 00_scaffolds_to_remove_byTaxa.ID | sed 's/>//' > 00_scaffolds_to_keep.ID

# Extract the surviving scaffolds.
seqtk subseq scaffolds.fasta 00_scaffolds_to_keep.ID > 00_scaffolds.clean.fasta
#4694 scaffolds

Get scaffolds that matched with Ectocarpus.

Taxonomic ID: 2880 Ectocarpus siliculosus 867726 Ectocarpus sp. CCAP 1310/34

# Scaffolds with a custom-db hit to either Ectocarpus taxid (2880 or 867726).
# NOTE(review): 00_all_scaffolds.ID is not created by any visible step (the
# keep-list above is 00_scaffolds_to_keep.ID); also grep -e '2880' is an
# unanchored substring match, so any field containing 2880 matches -- confirm.
grep -w -f 00_all_scaffolds.ID ../blobtools/masurca.vs.customdb.diamond.blastx.out | grep -e '2880' -e '867726' | cut -f1 | sort -u > 01_all_Ectocarpus_scaffolds.ID

# NOTE(review): 01_all_noEctocarpus_scaffolds.ID is used here, but the file
# created below is named 01_noEctocarpus.ID and is built AFTER this command;
# the name/order of these steps looks inconsistent -- verify.
grep -w -f 01_all_noEctocarpus_scaffolds.ID masurca_blobview.blobdb.blobDB.bestsum.table.txt | grep 'Eukaryota' | cut -f1 > 01_eukaryota.ID

#check these IDs in Uniprot
grep -f 01_eukaryota.ID ../blobtools/masurca.vs.uniprot.1e25.diam.taxified.out

#Total 1355 scaffolds matched with Ectocarpus

# Scaffolds that do not match with Ectocarpus
grep -v -w -f 01_all_Ectocarpus_scaffolds.ID 00_all_scaffolds.ID > 01_noEctocarpus.ID
# 3339 do not match with Ectocarpus

Using transcriptomes to clean the genome

# Spliced alignment of all transcripts to the cleaned scaffolds.
# -ax splice: spliced-alignment preset; -C 5: cost for non-canonical splice
# sites; --secondary=no: keep primary alignments only.
minimap2 -ax splice -C 5 --splice-flank=no --secondary=no -t 16 scaffolds.cleantaxaless1000l.fasta all_transcripts.fasta > mapped_transcripts.sam

#transform in paf format to get CIGAR string info for introns
./paftools.js sam2paf mapped_transcripts.sam > mapped_transcripts.paf

CIGAR string explained:

cg:Z:130M11864N224M3939N188M –> N means introns!

2586 scaffolds present introns

Calculate outliers with the R script –> Outrliers.R

Final genome: scaffolds.clean.fasta

3864 scaffolds

Assembly Transcriptomes

We used two transcriptomes downloaded from GenBank

Macrocystis integrifolia transcriptome: PRJNA322132

Reference: https://link.springer.com/article/10.1007/s13205-018-1204-4#Sec7

RNAseq for the brown alga Macrocystis pyrifera: PRJNA353611

Download data from GenBank

# Fetch the SRA run, then convert it to FASTQ.
prefetch <SRA_accession_number>
fastq-dump --split-3 <SRA_accession_number> #Decompress the SRA run and split into paired-end FASTQ files 

Check quality with FastQC and trim reads with Trimmomatic.

Two modes were used: de novo and “genome-guided” Trinity

De novo

# De novo Trinity assembly; samples.txt lists the per-sample FASTQ pairs.
Trinity \
        --seqType fq \
        --max_memory 250G \
        --samples_file samples.txt \
        --CPU 20 \
        --output trinity_out_dir

Genome guided

module load star/2.7.9a
module load samtools/1.12
module load trinity/2.12.0

#Genome index
# --genomeSAindexNbases 11 is scaled down for this small (~52 Mb) genome.
STAR --runThreadN 16 --runMode genomeGenerate --genomeDir genome --genomeFastaFiles scaffolds.clean.fasta --genomeSAindexNbases 11

#Mapping reads
# manifest.tsv lists the FASTQ files; output defaults to Aligned.out.sam.
STAR --runThreadN 16 --genomeDir genome --readFilesManifest manifest.tsv

#Sam to bam
# NOTE(review): samtools view -t formally expects the .fai index -- confirm.
samtools view -@ 8 -b -t scaffolds.clean.fasta Aligned.out.sam | samtools sort -@ 8 -o Aligned.out.bam

#assembly
# Genome-guided Trinity: reads are clustered by mapped locus; max intron 10 kb.
Trinity --genome_guided_bam Aligned.out.bam \
        --genome_guided_max_intron 10000 \
        --max_memory 250G \
        --CPU 20
PRJNA322132 PRJNA353611
De novo Trinity-DNP.fasta Trinity-DNR.fasta
124436 310167
Genome Guide Trinity-GGP.fasta Trinity-GGR.fasta
14586 34452

Concat all transcriptomes.

Genome annotation

Workflow based on Chen et al. 2019

https://github.com/TimothyStephens/Dinoflagellate_Annotation_Workflow

Clean Transcriptomes

Univec database

Univec will be used to clean transcriptomes

#blast+/2.12.0
#pasa/2.5.1

mkdir UNIVEC/
cd UNIVEC/

# NCBI UniVec: vector/adapter contamination database.
wget ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec
wget ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec_Core
wget ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/README.uv


makeblastdb  -in UniVec -dbtype nucl
makeblastdb  -in UniVec_Core -dbtype nucl

# seqclean (bundled with PASA) trims polyA tails and UniVec_Core vector
# matches from the merged transcriptome; -c 4 = number of CPUs.
seqclean all_transcripts.fasta -v UNIVEC/UniVec_Core -c 4

#Cleaned transcript: all_transcripts.fasta.clean
# 483627 transcripts

Repeat analysis


#repeatmasker/4.1.2p1
#rmblast/2.11.0
#repeatmodeler/2.0.2a


### If need TRF ####
#Download Tandem Repeats Finder
#https://tandem.bu.edu/trf/trf409.linux64.download.html
# Rename the downloaded binary, make it executable, and put it on PATH.
mv trf409.legacylinux64 trf
chmod +x trf
mv trf ~/bin
####################

# Genome file and its total length in bp (used by the repeat-landscape step).
export GENOME_NAME=scaffolds.clean.fasta
export GENOME_SIZE=51923561

#Running Repeat Modeler

# Build the RepeatModeler database, then model de novo repeat families.
BuildDatabase -name ${GENOME_NAME}_db -engine ncbi ${GENOME_NAME} > BuildDatabase.out
# Fixed misspelled log filenames: sdtout/sdterr -> stdout/stderr.
RepeatModeler -engine ncbi -pa ${SLURM_NTASKS} -database ${GENOME_NAME}_db 1> RepeatModeler.stdout 2> RepeatModeler.stderr

# Combine repeat modeler and repeat masker libraries.
cp RM_*/consensi.fa* .
cat consensi.fa.classified RepeatMasker.lib > ${GENOME_NAME}_CombinedRepeatLib.lib

#### Run Repeat Masker

# -x: mask with X; -no_is: skip bacterial IS check; -a: keep alignments
# (required later by calcDivergenceFromAlign.pl).
RepeatMasker -lib ${GENOME_NAME}_CombinedRepeatLib.lib -e ncbi -gff -x -no_is -a -pa ${SLURM_NTASKS} ${GENOME_NAME} 1> RepeatMasker.stdout 2> RepeatMasker.stderr


# SoftMask Genome
maskFastaFromBed -soft -fi ${GENOME_NAME} -fo ${GENOME_NAME}.softmasked -bed ${GENOME_NAME}.out.gff

# Get repeat features in gff3 format (used for EVM later on)
rmOutToGFF3.pl ${GENOME_NAME}.out > ${GENOME_NAME}.out.gff3

# Model repeat landscape (not needed for annotation)
calcDivergenceFromAlign.pl -s ${GENOME_NAME}.divsum ${GENOME_NAME}.align
createRepeatLandscape.pl -div ${GENOME_NAME}.divsum -g ${GENOME_SIZE} > ${GENOME_NAME}.html

PASA

module load perl/5.32.1
module load blat/3.6
module load samtools/1.12
module load pasa/2.5.1

# The job command(s):
export PASA=/apps/pasa/2.5.1/
export GENOME_NAME=scaffolds.clean.fasta
export GENOME_SIZE=51923561
export MAX_INTRON_LENGTH=70000
export SCRIPTS=/scratch1/iha002/WGS_Illumina_AGRF/AGRF_CAGRF21056777_HGHMYDRXY/annotation/Workflow/Dinoflagellate_Annotation_Workflow/scripts

# -C: create the PASA database; -R: run alignment/assembly; -t gives the
# cleaned transcripts and -u the untrimmed originals; both stdout and stderr
# are captured in pasa.log.
Launch_PASA_pipeline.pl -c alignAssembly.config -C -R -g ${GENOME_NAME} --MAX_INTRON_LENGTH ${MAX_INTRON_LENGTH} --ALIGNERS blat --CPU ${SLURM_NTASKS} -T -t all_transcripts.fasta.clean -u all_transcripts.fasta > pasa.log 2>&1

# Derive a training set (ORFs) from the PASA assemblies.
perl $PASA/scripts/pasa_asmbls_to_training_set.dbi --pasa_transcripts_fasta ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta --pasa_transcripts_gff3 ${GENOME_NAME}_pasadb.sqlite.pasa_assemblies.gff3

# Summary statistics of the predicted models.
perl ${SCRIPTS}/GeneStats.pl ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.cds ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 ${GENOME_NAME} > ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.cds.stats

Blast PASA output to NR

mkdir blast2db

#Link proteins predicted
ln -s scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep .

## Get sequences which are type:complete.
## NOTE(review): this grep collects EVERY header -- there is no filter on
## "type:complete" in the command itself; confirm whether a grep for
## 'type:complete' was lost from the pipeline.
grep '>' ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.pep | awk '{print $1}' | sed -e 's/>//' > Complete_Sequences.ids

## Get all seq IDs that have only  one CDS.
awk '$3=="CDS"{print $0}' ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 | awk '{ print $9}' | sed -e 's/.*Parent=\(.*\)/\1/' | sort | uniq -c | awk '{ if($1==1) print $2 }' > Single_CDS_Genes.ids

## Get seq IDs which have coords on the genome.
awk '$3=="mRNA"{print $0}' ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 | awk '{ print $9 }' | sed -e 's/ID=\(.*\);Parent.*/\1/' > Genes_with_genome_coords.ids

## Filter IDs

## Get seq IDs that are NOT Single Exon genes, have genome coords, and are type complete. 
## The later GoldenGenes stage will filter these genes anyway so we might as well filter them out now before the intensive blast stage.

# -k keeps IDs present in the genome-coords list; -r removes single-CDS genes.
python2 ../Dinoflagellate_Annotation_Workflow/scripts/filter_ids.py -i Complete_Sequences.ids -o Complete_Sequences.ids.filtered -k Genes_with_genome_coords.ids -r Single_CDS_Genes.ids

## Get pep sequences in filtered ID list
xargs samtools faidx ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.pep < Complete_Sequences.ids.filtered > ${GENOME_NAME}_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered

mkdir fasta_split/

# Split into 50-sequence chunks for the BLAST array job below.
seqkit split2 -s 50 scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered
ls -1 fasta_split/*.filtered > files2run.txt

Run BLAST array

#SBATCH --array=0-48

# The modules to load:
module load blast+/2.12.0
module load bioref/default

# The job command(s):

# INDIR must be exported via sbatch --export; each array task blasts one chunk.
if [ -d "$INDIR" ]
then
        # Build the sample list from the chunks (strip dir and .filtered suffix).
        # (backticks replaced with $( ) for readability/nesting)
        SAMPLES=( $(ls -1 ${INDIR}/*.filtered | sed -E 's/.+\/(.+)\.filtered/\1/') );

        if [ -n "$SLURM_ARRAY_TASK_ID" ]
        then
                i=$SLURM_ARRAY_TASK_ID
                blastp -query ${INDIR}/${SAMPLES[$i]}.filtered -db nr  -out ${INDIR}/${SAMPLES[$i]}.blastp.outfmt6 -num_threads ${SLURM_NTASKS} -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
        else
                # Diagnostics go to stderr so they stand out in the SLURM logs.
                echo "Error: Missing array index as SLURM_ARRAY_TASK_ID" >&2
        fi
else
        echo "Error: Missing input file directory as --export env INDIR or doesn't exist" >&2
fi

Sorting results

# Merge all per-chunk BLAST results.
cat FASTA_SPLIT/*.outfmt6 > blastp.outfmt6

# Keep hits with evalue < 1e-20 whose alignment covers >80% of the query
# (outfmt columns: 7=qstart, 8=qend, 13=qlen), then pull those peptides.
cat blastp.outfmt6 | awk -F "\t" '$11<1e-20' | awk -F "\t" '{ if (( (($8-$7)+1) / $13) > 0.8) {print $1}}' | sort | uniq | xargs samtools faidx scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered > scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage_onlyquery.faa  #### 1624 sequences

Transposon-PSI

Download Transposon-PSI and blastall. The program uses formatdb.

Create a ~/.ncbirc on home:

[NCBI]
data=<full-path-to-TransposonPSI-data>/Transposon-PSI/blast-2.2.26/data

# Run TransposonPSI (protein mode) on the filtered peptide set.
# NOTE(review): the BLAST-filter step above wrote ...top_coverage_onlyquery.faa,
# but this step reads ...top_coverage.faa -- confirm which filename is current.
perl TransposonPSI_08222010/transposonPSI.pl scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.faa prot

# Column 5 of the .TPSI.allHits table is the query sequence ID.
cat scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.faa.TPSI.allHits | awk '{ print $5 }' | sort | uniq > TransposonPSI.hit.seq.ids

CD-HITs


mkdir CDHITS
cd CDHITS/

# All peptide IDs that survived the BLAST-coverage filter.
grep '>' scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.faa | sed -e 's/>//' > Seqs_from_blast.ids

cp ../Dinoflagellate_Annotation_Workflow/scripts/filter_ids.py .

# Drop IDs flagged as transposons by TransposonPSI.
python2 filter_ids.py -i Seqs_from_blast.ids -o Seqs_from_blast.ids.filtered -r ../Transposon-PSI/TransposonPSI.hit.seq.ids

module load samtools/1.12

# Extract the surviving peptides.
xargs samtools faidx scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.faa < Seqs_from_blast.ids.filtered > scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa


# Collapse near-identical peptides at 75% identity; -n 5 is the CD-HIT
# word size for this identity range.
module load cd-hit/4.8.1
cd-hit \
   -i scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa \
   -o scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa.cdhit75 \
   -c 0.75 \
   -n 5 \
   -T 16 \
   -M 30000


# IDs of the cluster representatives (the non-redundant training candidates).
grep '>' scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa.cdhit75 | sed -e 's/>//' > scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa.cdhit75.ids

Golden Genes


# Fetch JAMg and copy out only the scripts needed for the golden-gene step.
git clone https://github.com/genomecuration/JAMg.git
date > JAMg/date_retrieved.txt

mkdir Prepare_Golden_Genes

cp JAMg/date_retrieved.txt Prepare_Golden_Genes/
cp JAMg/bin/prepare_golden_genes_for_predictors.pl Prepare_Golden_Genes/prepare_golden_genes_for_predictors.pl
cp JAMg/bin/run_exonerate.pl Prepare_Golden_Genes/run_exonerate.pl
cp -r JAMg/PerlLib Prepare_Golden_Genes/

export PERL5LIB=$PERL5LIB:"/home/iha002/iha002/WGS_Illumina_AGRF/AGRF_CAGRF21056777_HGHMYDRXY/annotation/Prepare_Golden_Genes/PerlLib"

#Install BIO
# (interactive CPAN session; the cpan[1]> line is the prompt transcript)
perl -MCPAN -Mlocal::lib -e shell
cpan[1]> install Bio::Perl

#Install cdbfasta
git clone https://github.com/gpertea/cdbfasta.git
# NOTE(review): make runs in the current directory -- a "cd cdbfasta" before it
# appears to be missing; confirm.
make
cd ~/bin
ln -s ../iha002/WGS_Illumina_AGRF/AGRF_CAGRF21056777_HGHMYDRXY/annotation/Prepare_Golden_Genes/cdbfasta/cdbfasta
ln -s ../iha002/WGS_Illumina_AGRF/AGRF_CAGRF21056777_HGHMYDRXY/annotation/Prepare_Golden_Genes/cdbfasta/cdbyank

## Check it works
module load perl/5.32.1
module load blast+/2.12.0
module load gmap/20210825
module load samtools/1.12
module load exonerate/2.2.0
module load augustus/3.4.0
module load snap-korflab/211201
module load trinity/2.12.0
module load emboss/6.6.0


export PERL5LIB=$PERL5LIB:"<PATH-to-PerlLIB>/Prepare_Golden_Genes/PerlLib"

# Build the "golden" training gene set from the PASA/TransDecoder output.
# -same_species: transcripts come from this assembly; -complete -no_single:
# keep only complete, multi-exon models; -intron 7000 caps intron length.
perl prepare_golden_genes_for_predictors.pl \
        -genome scaffolds.clean.fasta.masked \
        -softmasked scaffolds.clean.fasta.softmasked \
        -no_gmap -same_species -intron 7000 -cpu 16 -norefine -complete -no_single -verbose \
        -pasa_gff *.assemblies.fasta.transdecoder.gff3 \
        -pasa_peptides *.assemblies.fasta.transdecoder.pep \
        -pasa_cds *.assemblies.fasta.transdecoder.cds \
        -pasa_genome *.assemblies.fasta.transdecoder.genome.gff3 \
        -pasa_assembly *.assemblies.fasta

Running golden genes script

mkdir Golden_genes/

# NOTE(review): the symlinks below are created in the current directory --
# a "cd Golden_genes/" seems to be missing before them; confirm.
ln -s ../repeat/scaffolds.clean.fasta.masked .
ln -s ../repeat/scaffolds.clean.fasta.softmasked .
ln -s ../CDHITS/scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.pep.complete_only.filtered.top_coverage.final.faa.cdhit75.ids CD-HIT.ids.txt
ln -s ../../Prepare_Golden_Genes/prepare_golden_genes_for_predictors.pl .

## Get PASA output for GoldenSet.
## 
## This script should get just the IDs given in the CD-HIT.ids.txt file in
## the same format as you would expect from PASA. This seems to be required by the 
## prepare_golden_genes script for some reason.
## This script might not work with different versions of PASA if the seq names are in a different format.
python2 ../Dinoflagellate_Annotation_Workflow/scripts/get_PASA_for__prepare_golden_genes.py --ids CD-HIT.ids.txt --pasa_assembly ../PASA/*.assemblies.fasta --pasa_cds ../PASA/*.assemblies.fasta.transdecoder.cds --pasa_peptides ../PASA/*.assemblies.fasta.transdecoder.pep --pasa_gff ../PASA/*.assemblies.fasta.transdecoder.gff3 --pasa_genome ../PASA/*.assemblies.fasta.transdecoder.genome.gff3

Run golden_genes.script
# I had to install Parafly

GeneMark-ES

# GeneMark-ES self-training gene prediction on the masked genome.
# Fixed redirection: "1>&2> genemark.log" sent stdout to the log but left
# stderr on the terminal; "> genemark.log 2>&1" captures both streams.
perl gmes_linux_64/gmes_petap.pl --ES --cores 16 --format GFF3 --v --sequence scaffolds.clean.fasta.masked > genemark.log 2>&1

# Convert the GeneMark GTF to GFF3 with the helper shipped with MAKER.
# NOTE(review): genemark.gtf is read here despite --format GFF3 above -- confirm
# which output file GeneMark actually produced in this run.
perl ${MAKER}/genemark_gtf2gff3 genemark.gtf > genemark.gff3

SNAP

mkdir SNAP

# NOTE(review): the symlinks are created before any "cd SNAP" -- confirm.
ln -s ../repeat/scaffolds.clean.fasta.softmasked .
ln -s ../Golden_genes/final_golden_genes.gff3.nr.golden.zff .

module load samtools/1.12
module load snap-korflab/211201

# Pull only the scaffolds referenced by the golden ZFF into snap.fasta.
grep '>' final_golden_genes.gff3.nr.golden.zff | sed 's/>\(.*\)/\1/' | xargs samtools faidx scaffolds.clean.fasta.softmasked > snap.fasta

# Sanity-check the training set.
fathom final_golden_genes.gff3.nr.golden.zff snap.fasta -gene-stats
#118 sequences
#0.506642 avg GC fraction (min=0.465218 max=0.544290)
#142 genes (plus=77 minus=65)
#0 (0.000000) single-exon
#142 (1.000000) multi-exon
#190.731445 mean exon (min=3 max=1576)
#1170.067871 mean intron (min=90 max=6611)

fathom final_golden_genes.gff3.nr.golden.zff snap.fasta -validate > validate.txt

# Categorize genes (unique/warned/etc.) with 1 kb flanking sequence.
fathom final_golden_genes.gff3.nr.golden.zff snap.fasta -categorize 1000

# Combine the unique and warned sets for training.
cat uni.ann wrn.ann > com.ann
cat uni.dna wrn.dna > com.dna

# Export plus-strand training regions, then estimate HMM parameters.
fathom -export 1000 -plus com.ann com.dna

forge export.ann export.dna

# Assemble the trained SNAP HMM.
hmm-assembler.pl -c 0.000 scaffolds.clean.fasta . > scaffolds.clean.fasta.snap.hmm

mkdir PREDICT; cd PREDICT/

ln -s ../scaffolds.clean.fasta.softmasked .

# Run SNAP with the trained HMM; -lcmask treats lowercase (soft-masked)
# bases as repeats.
snap ../scaffolds.clean.fasta.snap.hmm scaffolds.clean.fasta.softmasked -lcmask -quiet 1> stdout.snap 2> stderr.snap

# Convert SNAP's native output to GFF3.
perl ../../Dinoflagellate_Annotation_Workflow/scripts/SNAP_output_to_gff3.pl stdout.snap scaffolds.clean.fasta.softmasked 1> snap.gff3 2> snap.gff3.strerr

AUGUSTUS

I didn’t use the training mode


mkdir PREDICTION

module load augustus/3.4.0

ln -s ../../repeat/scaffolds.clean.fasta.softmasked .

cp ../../Dinoflagellate_Annotation_Workflow/scripts/fasta-splitter.pl .
#Had to install File::Util - perl module

# Split the genome into 100 chunks for an AUGUSTUS array job.
perl fasta-splitter.pl --n-parts 100 --out-dir FASTA_SPLIT ${GENOME_NAME}.softmasked

# Per-array-task AUGUSTUS call.
# NOTE(review): ${SPECIES}, ${INDIR} and ${SAMPLES[$i]} come from an SBATCH
# array wrapper not shown here (cf. the blastp array script above).
augustus --softmasking=1 --gff3=on --UTR=off --codingseq=on --protein=on --exonnames=on --species=${SPECIES} ${INDIR}/${SAMPLES[$i]}.softmasked > ${INDIR}/${SAMPLES[$i]}.augustus.out

# Merge chunk outputs and extract the CDS/protein FASTAs (AUGUSTUS helpers).
cat FASTA_SPLIT/*.out | join_aug_pred.pl > augustus.gff3
getAnnoFasta.pl augustus.gff3
#augustus3.aa
#augustus3.codingseq

#Install Evidence modeler
export PERL5LIB=$PERL5LIB:"/home/iha002/sw/EVidenceModeler-1.1.1/PerlLib"
cp ~/sw/EVidenceModeler-1.1.1/EvmUtils/misc/augustus_GFF3_to_EVM_GFF3.pl .

# Reformat for EVM; the sed drops blank lines.
perl augustus_GFF3_to_EVM_GFF3.pl augustus.gff3 | sed '/^$/d' > augustus.cleaned.gff3
perl GeneStats.pl augustus3.codingseq augustus.cleaned.gff3 ../../scaffolds.clean.fasta > augustus.cleaned.gff3.stats

Evidence Modeler (Finally!!!)

Prepare data:


mkdir EVIDENCE_MODELER

cd EVIDENCE_MODELER/

# Link in every evidence track EVM will combine.
ln -s ../scaffolds.clean.fasta .
ln -s ../GeneMark/genemark.gff3 .
ln -s ../SNAP/PREDICT/snap.gff3 .
ln -s ../AUGUSTUS/augustus.cleaned.gff3 .
ln -s ../PASA/scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 .
ln -s ../repeat/scaffolds.clean.fasta.out.gff .

# Merge all predictions into one GFF3; the SNAP records are rewritten so that
# column 2 (source) reads "SNAP", matching the weights file entries.
grep -v "^#" genemark.gff3 > abinitio_gene_predictions.gff3
grep -v "^#" snap.gff3 | sed '/^$/d' | awk '{print $1 "\tSNAP\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" $7 "\t"$8 "\t" $9}' >> abinitio_gene_predictions.gff3
cat augustus.cleaned.gff3 >> abinitio_gene_predictions.gff3
sed '/^$/d' scaffolds.clean.fasta_pasadb.sqlite.assemblies.fasta.transdecoder.genome.gff3 >> abinitio_gene_predictions.gff3

Make weights.txt

ABINITIO_PREDICTION     GeneMark.hmme 2
ABINITIO_PREDICTION     SNAP    2
ABINITIO_PREDICTION     Augustus        6
OTHER_PREDICTION        transdecoder    10

Run Evidence Modeler


mkdir PARTITIONS_EVM
cd PARTITIONS_EVM/

ln -s ../abinitio_gene_predictions.gff3 .

# Partition the genome into 50 Mb segments (10 kb overlap) for parallel EVM.
partition_EVM_inputs.pl --genome scaffolds.clean.fasta --gene_predictions abinitio_gene_predictions.gff3 --segmentSize 50000000 --overlapSize 10000 --partition_listing partitions_list.out

# Generate one EVM command per partition (weights.txt scores each evidence source).
write_EVM_commands.pl --genome scaffolds.clean.fasta --gene_predictions abinitio_gene_predictions.gff3 --weights weights.txt --output_file_name evm.out --partitions partitions_list.out > commands.list

# Execute the generated commands in parallel; failures are recorded for rerun.
ParaFly -v -CPU 16 -c commands.list -failed_cmds commands.list.failed

recombine_EVM_partial_outputs.pl --partitions partitions_list.out --output_file_name evm.out

convert_EVM_outputs_to_GFF3.pl --partitions partitions_list.out --output evm.out --genome /scratch2/iha002/Macrocystis_annotation/scaffolds.clean.fasta

cd ../

# Concatenate the per-partition EVM GFF3 outputs into a single file.
# Fixed: the unescaped ";" terminated the shell command, so find lacked its
# required "-exec" terminator (error) and the redirect produced an empty file.
# "+" batches the cat invocations (equivalent result, fewer processes).
find PARTITIONS_EVM/ -name evm.out.gff3 -exec cat {} + > scaffolds.clean.fasta.evm.gff3


# Get CDS and Proteins.

#sed 's@EVM.evm.TU@EVM.evm.TU@' will remove ^\ (file separator) character from EVM^\evm.TU to EVM.evm.TU
# (the "." in the sed pattern matches the stray control character, so the
# replacement normalises it to a literal dot in the output IDs)

gff3_file_to_proteins.pl scaffolds.clean.fasta.evm.gff3 scaffolds.clean.fasta CDS | sed 's@EVM.evm.TU@EVM.evm.TU@' > scaffolds.clean.fasta.evm.cds.fna

gff3_file_to_proteins.pl scaffolds.clean.fasta.evm.gff3 scaffolds.clean.fasta prot | sed 's@EVM.evm.TU@EVM.evm.TU@' > scaffolds.clean.fasta.evm.protein.faa

DONE!